#!/usr/bin/python3
# -*- coding: UTF-8 -*-

# Python3 标准库
# https://docs.python.org/zh-cn/3/library/index.html

# 解析HTML
# 1: 正则表达式大法
# 2: requests-html pip install requests-html
# 3: BeautifulSoup pip install beautifulsoup4
# 4: lxml.XPath    pip install lxml
# [](https://imgconvert.csdnimg.cn/aHR0cHM6Ly9pbWcxLnR1aWNvb2wuY29tL25peUlSYkouanBnIXdlYg?x-oss-process=image/format,png)
# 5: SGMLParser
# 6: HTMLParser

# version: 3.11.4
# 执行
# 创建空白__init__.py


import datetime
import doctest
import glob
import hashlib
import html
import json
import math
import os
import random
import re
import shutil
import sys
import time
import urllib
import urllib.request
import zlib
from io import StringIO
from pathlib import Path

import requests
from bs4 import BeautifulSoup


def get_document_from_url(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup document.

    The response body is decoded as GBK (the target site's encoding),
    HTML entities are unescaped, and the result is parsed with the
    stdlib ``html.parser`` backend.
    """
    # Bug fix: the class is urllib.request.Request (capital R);
    # ``urllib.request.request`` does not exist and raised AttributeError.
    request = urllib.request.Request(url)
    # request.add_header('Host','www.biququ.la')
    # request.add_header('Referer','https://www.biququ.la/html/27744/')
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0')
    # Context manager ensures the connection is closed even if read() raises.
    with urllib.request.urlopen(request) as response:
        status = response.getcode()
        if status != 200:
            # Bug fix: the original passed url/status as extra print()
            # arguments instead of %-formatting them into the message.
            print("访问%s失败，返回码为:%s" % (url, status))
        # The site serves GBK-encoded pages; decode first, then unescape
        # HTML entities (e.g. &nbsp;) so the text reads as plain Chinese.
        html_string = html.unescape(response.read().decode('gbk'))
    document = BeautifulSoup(html_string, 'html.parser')
    return document


def parse_document(document, titleFlag):
    """Extract one chapter's text from a parsed novel page.

    Parameters
    ----------
    document : BeautifulSoup
        Parsed page; expected to contain ``div.bookname h1`` (the
        chapter title) and ``div#content`` (the chapter body).
    titleFlag : bool
        When True, the title is prepended to the returned text
        (used for the first page of a chapter only).

    Returns
    -------
    str
        Chapter text followed by a blank line, or ``""`` when the page
        has no title element (e.g. a missing/error page).
    """
    title_list = document.body.select("div.bookname h1")
    if not title_list:
        # Bug fix: the original bare ``return`` yielded None, which the
        # caller then passed to write2file() where fo.write(None) crashes.
        return ""
    # Collect fragments and join once instead of repeated str +=.
    parts = []
    if titleFlag:
        parts.append(title_list[0].text)
        parts.append("\n")
    for content in document.body.select("div#content"):
        parts.append(content.text)
        parts.append("\n")
    parts.append("\n")
    return "".join(parts)


def write2file(chapter, file):
    """Append *chapter* to *file* as UTF-8 text.

    Bug fixes vs. the original:
    - ``open(file, "a+", "utf8")`` raised TypeError because open()'s
      third positional argument is ``buffering`` (an int); the encoding
      must be passed as a keyword argument.
    - ``fo.close`` (missing parentheses) never actually closed the
      handle; the ``with`` block closes it even if write() raises.
    """
    with open(file, "a", encoding="utf8") as fo:
        fo.write(chapter)


def webcrawler():
    """Download chapters 21953942..21954955 of novel 109/109921 from
    www.63shu.com and append their text to a local .txt file.

    Each chapter spans two pages (``<id>.html`` and ``<id>_2.html``);
    the chapter title is written only once, from the first page.
    """
    file = "msdyw-ycqs-63shu.txt"
    for i in range(21953942, 21954956):
        # First page of the chapter: include the title.
        url = "http://www.63shu.com/109/109921/%d.html" % i
        chapter = parse_document(get_document_from_url(url), True)
        # Guard: parse_document may yield nothing for a missing page;
        # writing a None/empty chapter would be useless or crash.
        if chapter:
            write2file(chapter, file)
        # Second page of the same chapter: skip the title.
        url = "http://www.63shu.com/109/109921/%d_2.html" % i
        chapter = parse_document(get_document_from_url(url), False)
        if chapter:
            write2file(chapter, file)
    print("finish")

# Guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    webcrawler()

# biququ
# http://www.63shu.com/109/109921/21953942.html
# http://www.63shu.com/109/109921/21954956.html
